In [1]:
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn import linear_model
from sklearn import tree


Populating the interactive namespace from numpy and matplotlib

In [2]:
train = pd.read_csv('train.csv')
new_labels = train.columns.values
new_labels[-1] = 'total_rentals'
train.columns = new_labels
train[:5]


Out[2]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered total_rentals
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0 0 1 1

5 rows × 12 columns


In [3]:
plt.plot(train.casual)
plt.show()
plt.plot(train.registered)
plt.show()
plt.plot(train.total_rentals)
plt.show()



In [20]:
X = train.ix[:,1:9]
dts = [datetime.datetime.strptime(d,'%Y-%m-%d %H:%M:%S') for d in train.datetime]
years = [d.year for d in dts]
months = [d.month for d in dts]
days = [d.day for d in dts]
hours = [d.hour for d in dts]
X.insert(0,'hour',hours)
X.insert(0,'day',days)
X.insert(0,'month',months)
X.insert(0,'year',years)

pca5 = decomposition.PCA(n_components=5)
X_pc = pca5.fit_transform(X)
print X_pc.shape
plt.plot(X_pc[:,1],X_pc[:,4],'.')
plt.show()


(10886, 5)

In [21]:
y = train.ix[:,-1:-4:-1]
y[:2]


Out[21]:
total_rentals registered casual
0 16 13 3
1 40 32 8

2 rows × 3 columns


In [22]:
test_set = pd.read_csv('test.csv')
dts = [datetime.datetime.strptime(d,'%Y-%m-%d %H:%M:%S') for d in test_set.datetime]
years = [d.year for d in dts]
months = [d.month for d in dts]
days = [d.day for d in dts]
hours = [d.hour for d in dts]
print len(months), len(hours)
test_set.insert(1,'hour',hours)
test_set.insert(1,'day',days)
test_set.insert(1,'month',months)
test_set.insert(1,'year',years)
print test_set.shape

pca5 = decomposition.PCA(n_components=5)
test_set_pc = pca5.fit_transform(test_set.ix[:,1:])
print test_set_pc.shape
plt.plot(test_set_pc[:,1],test_set_pc[:,4],'.')
plt.show()


6493 6493
(6493, 13)
(6493, 5)

In [25]:
dt = tree.DecisionTreeRegressor()
cas = reg = y_tot = y_reg = y_cas = np.asarray([])
mean_score = 0
for yr in range(2011,2013):
    for m in range(1,13):
        X_lyr = X_pc[X.year < yr]
        X_tyr = X_pc[X.year == yr][X.month <= m]
        y_lyr = y[X.year < yr]
        y_tyr = y[X.year == yr][X.month <= m]
        X_ = pd.concat([X_lyr,X_tyr])
        y_ = pd.concat([y_lyr,y_tyr])
        dt.fit(X_,y_)
        pred = dt.predict(test_set_pc[test_set.month == m][test_set.year == yr].ix[:,1:])
        # lm.fit(X[X.month <= m][X.year <= yr],y[X.month <= m][X.year <= yr])
        # pred = lm.predict(test_set[test_set.month == m][test_set.year == yr].ix[:,1:])
        mean_score += dt.score(X_,y_)/24
        
        y_tot = np.append(y_tot, train.total_rentals[X.month == m][X.year == yr]*0)
        cas = np.append(cas, train.casual[X.month == m][X.year == yr])
        reg = np.append(reg, train.registered[X.month == m][X.year == yr])
        y_tot = np.append(y_tot, pred[:,0])
        y_reg = np.append(y_reg, pred[:,1])
        y_cas = np.append(y_cas, pred[:,2])
        cas = np.append(cas, [0]*pred.shape[0])
        reg = np.append(reg, [0]*pred.shape[0])

print mean_score
# print lm.alpha_
y_cas[y_cas < 0] = 0
y_reg[y_reg < 0] = 0
y_tot[y_tot < 0] = 0

plt.plot(y_cas)
plt.plot(y_reg)
plt.plot(y_cas+y_reg)
#plt.plot(y_tot,'.')
plt.show()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-25-82fdd41353ae> in <module>()
      8         y_lyr = y[X.year < yr]
      9         y_tyr = y[X.year == yr][X.month <= m]
---> 10         X_ = pd.concat([X_lyr,X_tyr])
     11         y_ = pd.concat([y_lyr,y_tyr])
     12         dt.fit(X_,y_)

/opt/gpi/local/anaconda/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity)
    924                        ignore_index=ignore_index, join=join,
    925                        keys=keys, levels=levels, names=names,
--> 926                        verify_integrity=verify_integrity)
    927     return op.get_result()
    928 

/opt/gpi/local/anaconda/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity)
   1000         self.verify_integrity = verify_integrity
   1001 
-> 1002         self.new_axes = self._get_new_axes()
   1003 
   1004     def get_result(self):

/opt/gpi/local/anaconda/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
   1232                 if i == self.axis:
   1233                     continue
-> 1234                 new_axes[i] = self._get_comb_axis(i)
   1235         else:
   1236             if len(self.join_axes) != ndim - 1:

/opt/gpi/local/anaconda/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_comb_axis(self, i)
   1259         else:
   1260             try:
-> 1261                 all_indexes = [x._data.axes[i] for x in self.objs]
   1262             except IndexError:
   1263                 types = [type(x).__name__ for x in self.objs]

AttributeError: 'numpy.ndarray' object has no attribute '_data'

In [35]:
plt.plot(y_tot[200:600],'.--')
plt.plot(cas[200:600]+reg[200:600],'.--')
# plt.plot(reg[0:1000],'.')
plt.show()
plt.plot(y_tot,'.')
plt.plot(cas+reg,'.')
# plt.plot(reg,'.')
plt.show()


Write out the submission!


In [36]:
sample_submission = pd.read_csv('sampleSubmission.csv')
new_labels = sample_submission.columns.values
new_labels[-1] = 'total_rentals'
sample_submission.columns = new_labels
print sample_submission.shape

my_submission = sample_submission.copy()
new_labels = my_submission.columns.values
new_labels[-1] = 'total_rentals'
my_submission.columns = new_labels
my_submission.total_rentals = np.round(y_cas+y_reg)
plt.plot(my_submission.total_rentals)
print my_submission.shape
my_submission[:5]


(6493, 2)
(6493, 2)
Out[36]:
datetime total_rentals
0 2011-01-20 00:00:00 5
1 2011-01-20 01:00:00 5
2 2011-01-20 02:00:00 3
3 2011-01-20 03:00:00 3
4 2011-01-20 04:00:00 4

5 rows × 2 columns


In [37]:
new_labels = my_submission.columns.values
new_labels[-1] = 'count'
my_submission.columns = new_labels
my_submission.to_csv('decision_tree_2.csv',index=False)

In [ ]: